Rendering HTML¶

In [ ]:
import plotly
plotly.offline.init_notebook_mode()

Framing the Problem¶

Problem is the risk analysis of patients with diabetes.

Import libraries & Load Dataset¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import datasets

# Load the diabetes dataset
data_diabetes = datasets.load_diabetes()
data_diabetes
Out[ ]:
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
         83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
        104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
        173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
        107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
         60., 174., 259., 178., 128.,  96., 126., 288.,  88., 292.,  71.,
        197., 186.,  25.,  84.,  96., 195.,  53., 217., 172., 131., 214.,
         59.,  70., 220., 268., 152.,  47.,  74., 295., 101., 151., 127.,
        237., 225.,  81., 151., 107.,  64., 138., 185., 265., 101., 137.,
        143., 141.,  79., 292., 178.,  91., 116.,  86., 122.,  72., 129.,
        142.,  90., 158.,  39., 196., 222., 277.,  99., 196., 202., 155.,
         77., 191.,  70.,  73.,  49.,  65., 263., 248., 296., 214., 185.,
         78.,  93., 252., 150.,  77., 208.,  77., 108., 160.,  53., 220.,
        154., 259.,  90., 246., 124.,  67.,  72., 257., 262., 275., 177.,
         71.,  47., 187., 125.,  78.,  51., 258., 215., 303., 243.,  91.,
        150., 310., 153., 346.,  63.,  89.,  50.,  39., 103., 308., 116.,
        145.,  74.,  45., 115., 264.,  87., 202., 127., 182., 241.,  66.,
         94., 283.,  64., 102., 200., 265.,  94., 230., 181., 156., 233.,
         60., 219.,  80.,  68., 332., 248.,  84., 200.,  55.,  85.,  89.,
         31., 129.,  83., 275.,  65., 198., 236., 253., 124.,  44., 172.,
        114., 142., 109., 180., 144., 163., 147.,  97., 220., 190., 109.,
        191., 122., 230., 242., 248., 249., 192., 131., 237.,  78., 135.,
        244., 199., 270., 164.,  72.,  96., 306.,  91., 214.,  95., 216.,
        263., 178., 113., 200., 139., 139.,  88., 148.,  88., 243.,  71.,
         77., 109., 272.,  60.,  54., 221.,  90., 311., 281., 182., 321.,
         58., 262., 206., 233., 242., 123., 167.,  63., 197.,  71., 168.,
        140., 217., 121., 235., 245.,  40.,  52., 104., 132.,  88.,  69.,
        219.,  72., 201., 110.,  51., 277.,  63., 118.,  69., 273., 258.,
         43., 198., 242., 232., 175.,  93., 168., 275., 293., 281.,  72.,
        140., 189., 181., 209., 136., 261., 113., 131., 174., 257.,  55.,
         84.,  42., 146., 212., 233.,  91., 111., 152., 120.,  67., 310.,
         94., 183.,  66., 173.,  72.,  49.,  64.,  48., 178., 104., 132.,
        220.,  57.]),
 'frame': None,
 'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6'],
 'data_filename': 'diabetes_data_raw.csv.gz',
 'target_filename': 'diabetes_target.csv.gz',
 'data_module': 'sklearn.datasets.data'}

Converting to dataframe¶

In [ ]:
df_diabetes = pd.DataFrame(data_diabetes.data,columns=data_diabetes.feature_names)
df_diabetes['target'] = data_diabetes.target
df_diabetes.head()
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0

Exploratory Data Analysis¶

Descibe the data¶

In [ ]:
df_describe = df_diabetes.describe()
df_describe
Out[ ]:
age sex bmi bp s1 s2 s3 s4 s5 s6 target
count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 442.000000
mean -2.511817e-19 1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17 3.918434e-17 -5.777179e-18 -9.042540e-18 9.293722e-17 1.130318e-17 152.133484
std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 77.093005
min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01 -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260971e-01 -1.377672e-01 25.000000
25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02 -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324559e-02 -3.317903e-02 87.000000
50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03 -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947171e-03 -1.077698e-03 140.500000
75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564379e-02 2.835801e-02 2.984439e-02 2.931150e-02 3.430886e-02 3.243232e-02 2.791705e-02 211.500000
max 1.107267e-01 5.068012e-02 1.705552e-01 1.320436e-01 1.539137e-01 1.987880e-01 1.811791e-01 1.852344e-01 1.335973e-01 1.356118e-01 346.000000
  • Dataset contains 442 data.
  • For all the features mean close to 0 and standard deviation close to 1. But for the target variable mean is 152 & standard deviation is 77 which indicates variability.

Plot graphs for each feature and target to find the insights¶

In [ ]:
df_diabetes.hist(figsize=(12,10))
plt.show()
  • The histograms indicates that age,bmi,bp,s1,s2,s3,s4,s5,s6 are centered around mean.
  • But the target variable is rightly skewed, that means higher number of patients with lower risk of diabetes progression and there are only less number of patients with high risk of diabetes progression.

Correlation Matrix¶

In [ ]:
df_diabetes_corr = df_diabetes.corr()
print(df_diabetes_corr)
plt.figure(figsize=(12,10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Heatmap of Correlation Matrix of Diabetes Dataset')
plt.show()
             age       sex       bmi        bp        s1        s2        s3  \
age     1.000000  0.173737  0.185085  0.335428  0.260061  0.219243 -0.075181   
sex     0.173737  1.000000  0.088161  0.241010  0.035277  0.142637 -0.379090   
bmi     0.185085  0.088161  1.000000  0.395411  0.249777  0.261170 -0.366811   
bp      0.335428  0.241010  0.395411  1.000000  0.242464  0.185548 -0.178762   
s1      0.260061  0.035277  0.249777  0.242464  1.000000  0.896663  0.051519   
s2      0.219243  0.142637  0.261170  0.185548  0.896663  1.000000 -0.196455   
s3     -0.075181 -0.379090 -0.366811 -0.178762  0.051519 -0.196455  1.000000   
s4      0.203841  0.332115  0.413807  0.257650  0.542207  0.659817 -0.738493   
s5      0.270774  0.149916  0.446157  0.393480  0.515503  0.318357 -0.398577   
s6      0.301731  0.208133  0.388680  0.390430  0.325717  0.290600 -0.273697   
target  0.187889  0.043062  0.586450  0.441482  0.212022  0.174054 -0.394789   

              s4        s5        s6    target  
age     0.203841  0.270774  0.301731  0.187889  
sex     0.332115  0.149916  0.208133  0.043062  
bmi     0.413807  0.446157  0.388680  0.586450  
bp      0.257650  0.393480  0.390430  0.441482  
s1      0.542207  0.515503  0.325717  0.212022  
s2      0.659817  0.318357  0.290600  0.174054  
s3     -0.738493 -0.398577 -0.273697 -0.394789  
s4      1.000000  0.617859  0.417212  0.430453  
s5      0.617859  1.000000  0.464669  0.565883  
s6      0.417212  0.464669  1.000000  0.382483  
target  0.430453  0.565883  0.382483  1.000000  
  • s3 shows very less correlation with other features and taget.
  • All the features except s3 shows a positive correlation with target variable.
  • BMI shows highest correlation with target among the features. That means change in BMI affects risk of diabetes progression high when comapred to other features.
  • s5 also has good enough correlation with target value.
  • So, BMI & s5 has more importance in predicting the risk of diabetes progression.

Cleaning the data¶

  • The given dataset is need not to be cleaned. Because it is mean centered to 0 and scaled by standard deviation of 1.

Split the dataset¶

In [ ]:
from sklearn.model_selection import train_test_split

# We are taking only BMI & Taget columns because it has already stated that 
# BMI is the independent variable and tage is the dependent variable.

X = df_diabetes[['bmi']]
y= df_diabetes['target']

X_train, X_old, y_train, y_old = train_test_split(X, y, test_size=0.3)

X_val,X_test,y_val,y_test = train_test_split(X_old, y_old, test_size=0.5)

print(X_train.shape, X_val.shape, X_test.shape)
(309, 1) (66, 1) (67, 1)

Polynomial Regression on BMI v/s diesease progression¶

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

def create_poly_model(X,y,degrees):
    models = {}
    for degree in degrees:
        model = Pipeline([('polynomial', PolynomialFeatures(degree=degree)),
                          ('linear', LinearRegression())])
        model.fit(X, y)
        models[degree] = model
    return models
    

degrees = list(range(0, 6))
models = create_poly_model(X_train, y_train, degrees)

# print models
for degree, model in models.items():
    print(f'Degree: {degree}')
    print(f'Model: {model}\n')
Degree: 0
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
                ('linear', LinearRegression())])

Degree: 1
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])

Degree: 2
Model: Pipeline(steps=[('polynomial', PolynomialFeatures()),
                ('linear', LinearRegression())])

Degree: 3
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
                ('linear', LinearRegression())])

Degree: 4
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression())])

Degree: 5
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
                ('linear', LinearRegression())])

Comapring Models¶

Report : R-Squared,MAPE,MSE for all models¶

In [ ]:
from sklearn.metrics import r2_score, mean_absolute_error

# MAPE function
def mape(y_act, y_pred):
    return np.mean(np.abs((y_act - y_pred) / y_act)) * 100

for degree, model in models.items():
    # Predictions for train and validation set
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    print(f'Degree: {degree}')
    # Train Data
    print(f'    Train R2: {r2_score(y_train, y_train_pred)}')
    print(f'    Train MAE: {mean_absolute_error(y_train, y_train_pred)}')
    print(f'    Train MAPE: {mape(y_train, y_train_pred)}\n')
   
    # Validation Data
    print(f'    Val R2: {r2_score(y_val, y_val_pred)}')
    print(f'    Val MAE: {mean_absolute_error(y_val, y_val_pred)}')
    print(f'    Val MAPE: {mape(y_val, y_val_pred)}\n')
Degree: 0
    Train R2: 0.0
    Train MAE: 63.50733653815942
    Train MAPE: 59.58147132189093

    Val R2: -0.02955490098489255
    Val MAE: 68.57119741100324
    Val MAPE: 62.47693581109562

Degree: 1
    Train R2: 0.3468251148566268
    Train MAE: 50.36855418910475
    Train MAPE: 45.737787569038275

    Val R2: 0.2256254729488586
    Val MAE: 56.50933674689731
    Val MAPE: 49.20006808168624

Degree: 2
    Train R2: 0.34682520948663176
    Train MAE: 50.368315074360446
    Train MAPE: 45.73754242992912

    Val R2: 0.2255975105856577
    Val MAE: 56.51124502126944
    Val MAPE: 49.20155695263188

Degree: 3
    Train R2: 0.34686782034381825
    Train MAE: 50.32425283843269
    Train MAPE: 45.692923263547186

    Val R2: 0.2248781164960707
    Val MAE: 56.56343809008254
    Val MAPE: 49.24369812392924

Degree: 4
    Train R2: 0.3481049919424113
    Train MAE: 50.17500402983993
    Train MAPE: 45.43122002324573

    Val R2: 0.2227122771160418
    Val MAE: 56.84880526558141
    Val MAPE: 49.49210413226087

Degree: 5
    Train R2: 0.35815333765033563
    Train MAE: 49.75626892720575
    Train MAPE: 44.92705706177226

    Val R2: 0.22449317094016064
    Val MAE: 55.95407571564134
    Val MAPE: 49.053396985860054

  • Model with degree 1 is the best model.
  • It has better R-sqaured value for validation data than other models an have a comapareable value for training. So, it helps to predict the unseen data.

Conclusion¶

Run the Chosen model with test data¶

In [ ]:
from sklearn.metrics import mean_squared_error

models[1].fit(X_test,y_test)  # 5 is the degree of the polynomial model in the models dictionary
y_test_pred = models[1].predict(X_test)

# Evaluating the model with test data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAE: {mae_test}')
Test Data
R2 score: 0.4451011602363443
MSE: 3855.4386220341016
MAE: 50.40507077901542
In [ ]:
# Plotting the model
plt.figure(figsize=(10,8))
plt.scatter(X_train, y_train, color='blue', label='Train Data')
plt.scatter(X_val, y_val, color='red', label='Validation Data')
plt.scatter(X_test, y_test, color='green', label='Test Data')

plt.plot(X_test, y_test_pred, color='black', label='Degree 1 test data')
plt.plot(X_train, models[1].predict(X_train), color='yellow', label='Degree 1 train data')
plt.plot(X_val, models[1].predict(X_val), color='orange', label='Degree 1 val data')

plt.title('Polynomial Regression with model degree 1')
plt.xlabel('BMI')
plt.ylabel('Target')
plt.legend()
plt.show()

Equation of the predicted model¶

In [ ]:
def print_pipeline_model_stats(model):
    # print model    
    print(f'Model: {model}')
    print(f'Coefficients: {model[-1].coef_}')
    print(f'Intercept: {model[-1].intercept_}')
    # generate equation string:
    equation = 'y = '
    for i, coef in enumerate(model[-1].coef_):
        equation += f'{coef:.2f} * x^{i} + '
    equation += f'{model[-1].intercept_:.2f}'
    print(f'Equation: {equation}')

print_pipeline_model_stats(models[1])
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())])
Coefficients: [   0.         1195.15601307]
Intercept: 161.77667890765423
Equation: y = 0.00 * x^0 + 1195.16 * x^1 + 161.78

Predict manually with bmi value = 0.05¶

In [ ]:
bmi_manual = 0.05
y_pred_manual = 1195.16 * (bmi_manual)**1 +  161.78
print(y_pred_manual)

y_model = models[1].predict([[bmi_manual]])
print(y_model)
221.538
[221.53447956]
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\sklearn\base.py:439: UserWarning:

X does not have valid feature names, but PolynomialFeatures was fitted with feature names

  • After predicting and manual calculation we got same value for both.

Trainable Parameters for 6 models¶

In [ ]:
trainable_params = {}
for degree in range(6):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_train)
    params_count = X_poly.shape[1] 
    trainable_params[degree] = params_count
    print(f'Degree {degree}: {poly.get_feature_names_out()}')

degrees = list(trainable_params.keys())
params = list(trainable_params.values())
print('Degrees:', degrees)
print('Trainable Parameters:', params)
Degree 0: ['1']
Degree 1: ['1' 'bmi']
Degree 2: ['1' 'bmi' 'bmi^2']
Degree 3: ['1' 'bmi' 'bmi^2' 'bmi^3']
Degree 4: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4']
Degree 5: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4' 'bmi^5']
Degrees: [0, 1, 2, 3, 4, 5]
Trainable Parameters: [1, 2, 3, 4, 5, 6]